{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "05c00e79-ed17-4afe-b231-e4e2ea0121d4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "train_data = pd.read_csv('用户新增预测挑战赛公开数据/train.csv')\n",
    "test_data = pd.read_csv('用户新增预测挑战赛公开数据/test.csv')\n",
    "\n",
    "train_data['common_ts'] = pd.to_datetime(train_data['common_ts'], unit='ms')\n",
    "test_data['common_ts'] = pd.to_datetime(test_data['common_ts'], unit='ms')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "42d9192c-eb77-43e1-8eeb-4da8ef6ceab6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def udmap_onethot(d):\n",
    "    v = np.zeros(9)\n",
    "    if d == 'unknown':\n",
    "        return v\n",
    "    \n",
    "    d = eval(d)\n",
    "    for i in range(1, 10):\n",
    "        if 'key' + str(i) in d:\n",
    "            v[i-1] = d['key' + str(i)]\n",
    "            \n",
    "    return v\n",
    "\n",
    "train_udmap_df = pd.DataFrame(np.vstack(train_data['udmap'].apply(udmap_onethot)))\n",
    "test_udmap_df = pd.DataFrame(np.vstack(test_data['udmap'].apply(udmap_onethot)))\n",
    "\n",
    "train_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]\n",
    "test_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dafa2fd8-45c7-4306-a9f1-637ec32b4bf6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "train_data = pd.concat([train_data, train_udmap_df], axis=1)\n",
    "test_data = pd.concat([test_data, test_udmap_df], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ac7359a2-80e0-4f24-a56b-c5fe1c2def5b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "train_data['eid_freq'] = train_data['eid'].map(train_data['eid'].value_counts())\n",
    "test_data['eid_freq'] = test_data['eid'].map(train_data['eid'].value_counts())\n",
    "\n",
    "train_data['eid_mean'] = train_data['eid'].map(train_data.groupby('eid')['target'].mean())\n",
    "test_data['eid_mean'] = test_data['eid'].map(train_data.groupby('eid')['target'].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0b3c5362-df8d-4238-9d93-d3b4686786b8",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "train_data['udmap_isunknown'] = (train_data['udmap'] == 'unknown').astype(int)\n",
    "test_data['udmap_isunknown'] = (test_data['udmap'] == 'unknown').astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "204a7cd1-73a4-4d1d-b900-ba9bec105efd",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "train_data['common_ts_hour'] = train_data['common_ts'].dt.hour\n",
    "test_data['common_ts_hour'] = test_data['common_ts'].dt.hour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "43fd2721-76ef-4453-943d-3cca2d86516c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[LightGBM] [Info] Number of positive: 87201, number of negative: 533155\n",
      "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022261 seconds.\n",
      "You can set `force_row_wise=true` to remove the overhead.\n",
      "And if memory is not enough, you can set `force_col_wise=true`.\n",
      "[LightGBM] [Info] Total Bins 1547\n",
      "[LightGBM] [Info] Number of data points in the train set: 620356, number of used features: 22\n",
      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.140566 -> initscore=-1.810596\n",
      "[LightGBM] [Info] Start training from score -1.810596\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LGBMClassifier()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LGBMClassifier</label><div class=\"sk-toggleable__content\"><pre>LGBMClassifier()</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "LGBMClassifier()"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import lightgbm as lgb\n",
    "\n",
    "clf = lgb.LGBMClassifier()\n",
    "clf.fit(\n",
    "    train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1),\n",
    "    train_data['target']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f0c9bb8c-7f2c-4031-9cb8-ff928bd4fc9e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "pd.DataFrame({\n",
    "    'uuid': test_data['uuid'],\n",
    "    'target': clf.predict(test_data.drop(['udmap', 'common_ts', 'uuid'], axis=1))\n",
    "}).to_csv('submit.csv', index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d6c439e-3ba2-439d-8a44-4e84d1bfad81",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
